from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import cufflinks as cf
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
plotly.offline.init_notebook_mode()
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.ticker import EngFormatter
# MySQL-connector-python
engine = create_engine('mysql+mysqlconnector://<user>:<password>@igor.gold.ac.uk/<databasename>')
connection = engine.connect()
df = pd.read_sql_query("""SELECT year, COUNT(*) FROM races GROUP BY year;""", engine)
df.rename(columns={'COUNT(*)':'numofraces',},inplace=True)
df.head()
df.dtypes
df.set_index(['year'],inplace=True)
#Create scatter plot combined with bar plot
trace1 = go.Scatter(y=df['numofraces'],
x=df.index,
name='Teams',
mode = 'lines+markers',
fill='tonexty')
#Establish data to be used in plot
data = [trace1]
#Set layout configuration, title, xaxis, yaxis, range for yvalues
layout = go.Layout(
title='Races per Championship',
autosize=False,
width=850,
height=350,
xaxis=dict(
title='Year'),
yaxis=dict(
title='Number',
autorange=True,
# range=[0,120]),
)
)
#create plot figure
fig = go.Figure(data=data, layout=layout)
#show plot
plotly.offline.iplot(fig)
# Summary Stats
print(df.describe())
print('median ',(df['numofraces'].median()))
df2=pd.read_sql_query("""SELECT races.year,
races.raceId,
results.constructorId
FROM results
INNER JOIN races ON races.raceId = results.raceId
""", engine)
df2.head()
df2.columns
df2 = df2.groupby('year')['constructorId'].nunique()
df3=pd.read_sql_query("""SELECT races.year,
races.raceId,
results.driverId
FROM results
INNER JOIN races ON races.raceId = results.raceId
""", engine)
df3.head()
df3.columns
df3 = df3.groupby('year')['driverId'].nunique()
df2 = pd.DataFrame(df2)
df3 = pd.DataFrame(df3)
df4 = df2.join(df3, how='outer')
df4_2 = df2.join(df3, how='outer')
df4_2.head()
df4['driverperteam'] = df4['driverId']/df4['constructorId']
df4.head()
df4_2.rename(columns={'constructorId':'Teams','driverId':'Drivers'},inplace=True)
df_4_2_Stats=df4_2[['Teams','Drivers']].describe()
df_4_2_Stats['Driver/team'] = df4['driverperteam'].describe()
#Create scatter plot combined with bar plot
trace1 = go.Scatter(y=df4_2['Teams'],name='Teams',x=df4_2.index,mode = 'markers')
trace2 = go.Scatter(y=df4_2['Drivers'],name='Drivers',x=df4_2.index)
trace3 = go.Bar(y=df4['driverperteam'],name='Drivers per Team',yaxis='y2',opacity=0.3,x=df4.index,)
#Establish data to be used in plot
data = [trace3,trace1,trace2]
#Set layout configuration, title, xaxis, yaxis, specific range for yvalues
layout = go.Layout(
title='Relationship of number and drivers and teams per Year',
legend=dict(orientation="h",x=-.1, y=1.2),
autosize=False,
width=850,
height=350,
xaxis=dict(
title='Year'),
yaxis=dict(
title='Number',
autorange=False,
range=[0,120]),
yaxis2=dict(
title='Drivers Per team',
autorange=False,
range=[0,12],
titlefont=dict(color='rgb(0, 0, 0)'),
tickfont=dict(color='rgb(0, 0, 0)'),
showgrid=False,
overlaying='y',
side='right')
)
#create plot figure
fig = go.Figure(data=data, layout=layout)
#show plot
plotly.offline.iplot(fig)
print(df_4_2_Stats)
del df4['constructorId']
del df4['driverId']
df5=pd.read_sql_query("""SELECT races.year,
races.name,
results.fastestLapSpeed
FROM results
INNER JOIN races ON races.raceId = results.raceId
WHERE results.position = 1
""", engine)
df5.head()
df5['fastestLapSpeed'] = pd.to_numeric(df5['fastestLapSpeed'],errors="coerce")
df5['name'] = df5['name'].astype('category')
df5.dtypes
table = pd.pivot_table(df5,index =['name'] ,columns=['year'])
table_to_plot = table.T
box_speed = table_to_plot.dropna(axis=1, how='all')
box_speed = box_speed.swaplevel(0, 1, axis=0)
box_speed.rename(columns={'Abu Dhabi Grand Prix':'Abu_Dhabi','Australian Grand Prix':'Australia'
,'Austrian Grand Prix':'Austria','Azerbaijan Grand Prix':'Azerbaijan'
,'Bahrain Grand Prix':'Bahrain','Belgian Grand Prix':'Belgium'
,'Brazilian Grand Prix':'Brazil','British Grand Prix':'UK'
,'Canadian Grand Prix':'Canada','Chinese Grand Prix':'China'
,'European Grand Prix':'Europe','French Grand Prix':'France'
,'German Grand Prix':'Germany','Hungarian Grand Prix':'Hungary'
,'Indian Grand Prix':'India','Italian Grand Prix':'Italy'
,'Japanese Grand Prix':'Japan','Korean Grand Prix':'Korea'
,'Malaysian Grand Prix':'Malaysia','Mexican Grand Prix':'Mexico'
,'Monaco Grand Prix':'Monaco','Russian Grand Prix':'Russia'
,'San Marino Grand Prix':'SanMarino','Singapore Grand Prix':'Singarope'
,'Spanish Grand Prix':'Spain','Turkish Grand Prix':'Turkey'
,'United States Grand Prix':'USA'
},inplace=True)
#box_speed.T.mean()
#Create box plot of average speed in fastest lap
#Add series as an element, use the mean and add some jitter to data points
Italy = go.Box(y=box_speed['Italy'],name='Italy',boxmean=True)
Belgium = go.Box(y=box_speed['Belgium'],name='Belgium',boxmean=True)
UK = go.Box(y=box_speed['UK'],name='UK',boxmean=True)
Austria = go.Box(y=box_speed['Austria'],name='Austria',boxmean=True)
Turkey = go.Box(y=box_speed['Turkey'],name='Turkey',boxmean=True)
Australia = go.Box(y=box_speed['Australia'],name='Australia',boxmean=True)
Japan = go.Box(y=box_speed['Japan'],name='Japan',boxmean=True)
SanMarino = go.Box(y=box_speed['SanMarino'],name='SanMarino',boxmean=True)
Russia = go.Box(y=box_speed['Russia'],name='Russia',boxmean=True)
India = go.Box(y=box_speed['India'],name='India',boxmean=True)
#Establish data to be used in plot
data = [Italy, Belgium, UK, Austria, Turkey, Australia, Japan, SanMarino, Russia, India]
#Set layout configuration, title, xaxis, yaxis, specific range for yvalues
layout = go.Layout(
title='Average speed in km/h of the Fastest Lap of the winner of the race',
autosize=False,
width=850,
height=450,
xaxis=dict(
title='Grand Prix',
type='category'
),
yaxis=dict(
title='km/h',
autorange=True,
nticks=5
),
)
#create plot figure
fig = go.Figure(data=data, layout=layout)
#show plot
plotly.offline.iplot(fig)
print(box_speed.describe().T.sort_values(by=['mean'],ascending=False).iloc[0:10,]['mean'].describe())
df6=pd.read_sql_query("""
SELECT races.year,
races.name,
status.statusId,
status.status
FROM status
INNER JOIN results ON status.statusId = results.statusId
INNER JOIN races ON races.raceId = results.raceId
WHERE results.statusId <> 1
""", engine)
df6.head()
df6['status'] = df6['status'].astype('category')
df6['name'] = df6['name'].astype('category')
df6.dtypes
year_breakdown = df6
del year_breakdown['name']
del year_breakdown['statusId']
year_breakdown = pd.crosstab(year_breakdown.status, year_breakdown.year, margins=False)
year_breakdown = year_breakdown.T
year_breakdown = year_breakdown[year_breakdown.columns.drop(list(year_breakdown.filter(regex='Lap')))]
year_breakdown = year_breakdown[year_breakdown.columns.drop(list(year_breakdown.filter(regex='Laps')))]
year_breakdown = year_breakdown.sort_index(axis=0 ,ascending=False)
year_breakdown = year_breakdown.sort_index(axis=1 ,ascending=False)
breakdown_evolution = pd.DataFrame(year_breakdown.iloc[58:68,].sum(),columns=['50-59'])
breakdown_evolution['60-69'] = year_breakdown.iloc[48:58,].sum()
breakdown_evolution['70-79'] = year_breakdown.iloc[38:48,].sum()
breakdown_evolution['80-89'] = year_breakdown.iloc[28:38,].sum()
breakdown_evolution['90-99'] = year_breakdown.iloc[18:28,].sum()
breakdown_evolution['00-09'] = year_breakdown.iloc[8:18,].sum()
breakdown_evolution['10-17'] = year_breakdown.iloc[0:8,].sum()
breakdown_evolution.head()
breakdown_evolution = breakdown_evolution.T
breakdown_pareto = pd.DataFrame(breakdown_evolution.mean().sort_values(ascending=False),columns=['Average'])
breakdown_pareto['%'] = breakdown_evolution.mean().sort_values(ascending=False)/breakdown_evolution.mean().sort_values(ascending=False).sum()
breakdown_pareto['Pareto'] = breakdown_pareto['%'].cumsum()
#breakdown_pareto['Pareto'][0:20]
top20 = breakdown_evolution[['Engine','Did not qualify','Accident','Gearbox','Spun off','Collision',
'Suspension','Did not prequalify','Transmission','Electrical','Withdrew','Brakes',
'Clutch','Not classified','Fuel system','Turbo','Disqualified','Hydraulics','Ignition'
,'Overheating']].copy()
# Initialize the matplotlib figure
fig, ax = plt.subplots()
fig.set_size_inches(14,7)
# Plot the data
plt.title('Top 20 reasons to not finish a race per decade \n')
ax = sns.heatmap(top20.T,robust=True,linewidths=.5,cmap='Oranges')
plt.show()
print(top20.describe().T.sort_values(by=['mean'],ascending=False)['mean'].head())
print(top20.describe().T.sort_values(by=['mean'],ascending=False)['mean'].tail())
year_breakdown = year_breakdown.reindex(index=year_breakdown.index[::-1])
# Initialize the matplotlib figure
fig, ax = plt.subplots()
fig.set_size_inches(14,7)
# Plot the data
plt.title('Most frequent reasons to quit a race per year 08-17')
ax = sns.heatmap(year_breakdown[['Engine','Did not qualify','Accident','Gearbox','Spun off','Collision',
'Suspension','Did not prequalify','Transmission','Electrical','Withdrew','Brakes',
'Clutch','Not classified','Fuel system','Turbo','Disqualified','Hydraulics','Ignition'
,'Overheating']].iloc[58:68,].T, linewidths=.5,cmap='Oranges',yticklabels=False)
plt.show()
print(year_breakdown[['Engine','Did not qualify','Accident','Gearbox','Spun off','Collision',
'Suspension','Did not prequalify','Transmission','Electrical','Withdrew','Brakes',
'Clutch','Not classified','Fuel system','Turbo','Disqualified','Hydraulics','Ignition'
,'Overheating']].iloc[58:68,].describe().T.sort_values(by=['mean'],ascending=False)['mean'].head())
df8=pd.read_sql_query("""
SELECT races.year,
races.round,
constructors.name,
constructorStandings.position
FROM constructors
INNER JOIN constructorStandings
ON constructors.constructorId = constructorStandings.constructorId
INNER JOIN races
ON constructorStandings.raceId = races.raceId
""", engine)
#
df8.head()
df8['name'] = df8['name'].astype('category')
df8['round'] = df8['round'].astype('category')
winners = df8.copy()
winners.set_index(['year'],inplace=True)
winners = pd.crosstab(winners.name,winners.position, margins=False)
top_winners = pd.DataFrame(winners[1])
top_winners.rename(columns={'[1]':'Wins',
},inplace=True)
top_winners = top_winners.transpose()
top_winners.rename(index={1:'numofwins'}, inplace=True)
top_winners = top_winners.T.sort_values("numofwins", ascending=False)
top_winners = top_winners[(top_winners.numofwins > 0)]
top_winners.head()
sns.set(style="whitegrid")
# Initialize the matplotlib figure
f, ax = plt.subplots(figsize=(12, 6))
# Plot the data
sns.barplot(palette="Reds_r",
data=top_winners.T,
orient='h',
label="Total")
# Add a legend and informative axis label
plt.title('Number of 1st place podiums in all the years \n')
ax.set(ylabel="Teams",
xlabel="Total Amount of wins")
sns.despine(left=True, bottom=True)
plt.show()
print(top_winners.head())
wins = np.array([224,165,114])
runs = np.array([951,825,685])
wins/runs
df9=pd.read_sql_query("""
SELECT *
FROM drivers
""", engine)
df9.head()
df9.shape
drivers_per_contry = df9.drop(columns=['driverRef', 'number','code','forename','surname','url','dob','driverId'])
drivers_per_contry.head()
drivers_per_contry['nationality'] = drivers_per_contry['nationality'].astype('category')
drivers_per_contry = drivers_per_contry.apply(pd.value_counts)
code = ['GBR', 'USA', 'ITA', 'FRA', 'DEU',
'BRA', 'ARG', 'ZAF', 'BEL',
'CHE', 'JPN', 'AUS', 'NLD', 'ESP',
'AUT', 'CAN', 'SWE', 'FIN',
'NZL', 'MEX', 'IRL', 'DNK', 'URY',
'PRT', 'RHA', 'COL', 'VEN',
'DEU', 'MCO', 'RUS', 'IND', 'CZE',
'HUN', 'CHL', 'IDN', 'LIE',
'MYS', 'POL', 'ARG', 'THA',
'USA']
code = pd.Series(code)
drivers_per_contry['code'] = code.values
drv_per_contry = drivers_per_contry.drop(['Argentine-Italian', 'American-Italian','East German','Rhodesian'])
drv_per_contry['nation'] = drv_per_contry.index
drv_per_contry_pct = pd.DataFrame(drv_per_contry['nationality']/drv_per_contry['nationality'].sum().copy())
#Create choropleth world map
#Establish data to be used in plot
data = dict(
type = 'choropleth',
locations = drv_per_contry['code'],
z = drv_per_contry['nationality'].astype(float),
text = drv_per_contry['nation'],
colorscale = [[0,"rgb(5, 10, 172)"],[0.35,"rgb(40, 60, 190)"],[0.5,"rgb(70, 100, 245)"],
[0.6,"rgb(90, 120, 245)"],[0.7,"rgb(106, 137, 247)"],[1,"rgb(220, 220, 220)"]],
autocolorscale = False,
reversescale = True,
marker = dict(
line = dict (
color = 'rgb(220,220,220)',
width = 0.5
) ),
colorbar = dict(
title = 'Number of Drivers'),
)
data = go.Data([data])
#Set layout configuration, title, type of map
layout = dict(
title = 'Formula 1 Drivers per Country',
geo = dict(
showframe = True,
showcoastlines = True,
projection = dict(
type = 'equirectangular'
)
)
)
layout = go.Layout(layout)
#create plot figure
fig = go.Figure(data=data, layout=layout)
#show plot
plotly.offline.iplot(fig)
#Create pie chart
#Establish data to be used in plot
labels = ['British','American','Italian','Others']
values = [drv_per_contry['nationality'].iloc[0],
drv_per_contry['nationality'].iloc[1],
drv_per_contry['nationality'].iloc[2],
# drv_per_contry['nationality'].iloc[3],
(drv_per_contry['nationality'].sum()-drv_per_contry['nationality'].iloc[0:3,].sum())]
colors = ['#FF754E', '#D3A0E8', '#5AA0FF', '#66E896','#FFFB4E']
trace = go.Pie(labels=labels,
values=values,
hoverinfo='label+percent',
textinfo='label+percent',
textfont=dict(size=16),
marker=dict(colors=colors,
line=dict(color='#000000', width=1)))
#show plot
plotly.offline.iplot([trace])
import html5lib
driver_standings = pd.DataFrame()
for i in range (1950,2017):
new = pd.read_html('https://www.formula1.com/en/results.html/'+str(i)+'/drivers.html')[0]
new['year']= str(i)
driver_standings = driver_standings.append(new,ignore_index=True)
driver_standings = driver_standings.drop(columns=['Unnamed: 0', 'Unnamed: 6'])
driver_standings.set_index(['year'],inplace=True)
driver_standings['Nationality'] = driver_standings['Nationality'].astype('category')
driver_standings['Car'] = driver_standings['Car'].astype('category')
driver_standings['Driver'] = driver_standings['Driver'].astype('category')
driver_standings['Pos'] = pd.to_numeric(driver_standings['Pos'],errors="coerce")
winners_per_country = driver_standings.loc[driver_standings['Pos'] == 1].copy()
winners_per_country.dtypes
#Create pie chart
#Establish data to be used in plot
labels = ['British','German','Brazilian','Others']
values = [(winners_per_country['Nationality'].value_counts().iloc[0]),
(winners_per_country['Nationality'].value_counts().iloc[1]),
(winners_per_country['Nationality'].value_counts().iloc[2]),
# drv_per_contry['nationality'].iloc[3],
(winners_per_country['Nationality'].value_counts().sum())-(winners_per_country['Nationality'].value_counts().iloc[0:3].sum())]
colors = ['#FF754E', '#D3A0E8', '#5AA0FF', '#66E896','#FFFB4E']
trace = go.Pie(labels=labels,
values=values,
hoverinfo='label+percent',
textinfo='label+percent',
textfont=dict(size=16),
marker=dict(colors=colors,
line=dict(color='#000000', width=1)))
#show plot
plotly.offline.iplot([trace])